####################################################################################
####### Creating data for analysis 

# This code is not replicable because it uses raw data with identifiable information

# Initial part of the code 
#   Creates the main data set (survey data + admin)
#   Creates subsets of the main set used in each paper
#   Creates anonymous (public) versions of these data sets 
# Second part of the code 
#   Creates datasets for the analysis of attrition

# Attention: This is running with the auxiliary functions in the Replication-WhatYouSee-BJPS folder

####################################################################################

rm(list=ls())
options(scipen = 999) #disabling sci notation

library(tidyverse)
library(readxl)
library(data.table)
library(estimatr)
library(randomizr)
library(genderBR)
library(lubridate)
library(mice)

home <- "" #dir

source(paste0(home, "Replication-WhatYouSee-BJPS/Code/functions.R"))

#Administrative data on beneficiaries #commented out because no need to run this every time 
# if(1==2){#commenting out
# far <- read_excel("~/Dropbox/DB/BENEFICIARIOSMCMV122017.xlsx", sheet = 1,
#                    skip = 1)
#  pnhu <- read_excel("~/Dropbox/MCMV_replication/DB/BENEFICIARIOSMCMV122017.xlsx", sheet = 3,
#                     skip = 1)
#  fds <- read_excel("~/Dropbox/MCMV_replication/DB/BENEFICIARIOSMCMV122017.xlsx", sheet = 2,
#                    skip = 1)
# 
# #Combine and add column about type
#  fds <- fds %>% mutate(type = rep("fds", rep(nrow(fds))))
#  far <- far %>% mutate(type = rep("far", rep(nrow(far))))
#  pnhu  <- pnhu %>% mutate(type = rep("pnhu", rep(nrow(pnhu))))
#  
#  far <- far %>% mutate(Numero_PIS_do_Mutuario = as.character(Numero_PIS_do_Mutuario))
#  mcmv <- far %>% bind_rows(pnhu, fds)
#  rj <- mcmv %>% filter(UF == "RJ" & Nome_Municipio == "RIO DE JANEIRO")
#  table(nchar(rj$Numero_CPF_Mutuario))
#  rj$cpfstd <- sprintf("%011.0f", rj$Numero_CPF_Mutuario)#From numeric to character with fixed number of digits
# #save(rj, file = "~/Dropbox/MCMV_replication/DB/rj_beneficiaries.Rda")
# }#close comment

#Import administrative data
load(paste0(home, "DB/rj_beneficiaries.Rda"))
load(paste0(home, "DB/all_wide_rais_october.Rda"))
dob <- read.csv2(paste0(home, "DB/Serasa-DoB-Survey.csv"))

#Read the "sorteados" file 
load(file=paste0(home,"DB/sorteados_october2018.Rda"))
#Read the "inscritos" file 
load(file=paste0(home,"DB/inscritos_october2018.Rda"))	

#Results from the survey company
results <- read.csv(paste0(home, "DB/pm010_17_r01.csv"), 
                    na.strings=c("","NA"), stringsAsFactors = F)
results <- as_tibble(results)
results <- results %>% mutate(key = codigo_de_listagem)

results2 <- read.csv(paste0(home, "DB/pm010_17_r02.csv"),
                     na.strings=c("","NA"), stringsAsFactors = F)
results2 <- as_tibble(results2)
results2 <- results2 %>% mutate(key = codigo_de_listagem)

save(results, file = paste0(home,"DB/results.Rda"))
save(results2, file = paste0(home,"DB/results2.Rda"))

#Samples sent to survey company
#Sample 1
final_sample <- read.csv(paste0(home, "DB/survey.late.forfield.csv"), stringsAsFactors = F)
final_sample <- as_tibble(final_sample)
final_sample <- final_sample %>% mutate(key = X, sample_id = rep("sample1", nrow(final_sample))) %>% select(-X)

#Sample 2
final_sample2 <- read.csv(paste0(home, "DB/survey.late.forfield_reposicao.csv"), stringsAsFactors = F)
final_sample2 <- as_tibble(final_sample2)
final_sample2 <- final_sample2 %>% mutate(key = seq(1, nrow(final_sample2)), sample_id = rep("sample2", nrow(final_sample2)), 
                                          nu_logradouro_fam = as.character(nu_logradouro_fam)) %>% select(-ids)

all_samples <- bind_rows(final_sample, final_sample2)
save(all_samples, file = paste0(home, "DB/all_samples1_2.Rda"))

#Getting keys and CPFs
keys_cpfs <- all_samples %>% select(key, cpfstd)
length(unique(keys_cpfs$key)) 
length(unique(keys_cpfs$cpfstd))
#Keys match CPF by sample but not overall

#Joining sample 1
survey_late <- final_sample %>% right_join(results, by = "key") %>%
  mutate(survey = rep("late_1", nrow(results)))
complete_interviews1 <- nrow(filter(survey_late, g1 == "Sim"))
survey1 <- filter(survey_late, g1 == "Sim")

#Joining sample 2
survey_late2 <- final_sample2 %>% right_join(results2, by = "key") %>% mutate(survey = rep("late_2", nrow(results2)))
complete_interviews2 <- nrow(filter(survey_late2, g1 == "Sim"))
survey2 <- survey_late2 %>% filter(g1 == "Sim")
survey_late2 <- survey_late2 %>% mutate(q55 = as.character(q55), q88 = as.character(q88),
                                        q89 = as.character(q88), q82 = as.character(q82))

save(survey_late, file = paste0(home,"DB/survey_late.Rda"))
save(survey_late2, file = paste0(home,"DB/survey_late2.Rda"))

#Survey completed with IDs
survey_W1 <- bind_rows(survey1, survey2)

######################################################Creating individual-lottery data

#There are repeated interviews
tmp <- survey_W1 %>% filter(duplicated(cpfstd))
#Excluding repeated from survey
survey_W1 <- survey_W1 %>% filter(!cpfstd %in% tmp$cpfstd)
cpfs_interviewed <- unique(survey_W1$cpfstd)

#Select lotteries inscritos
interviewed_inscritos <- inscritosf %>% mutate(cpfstd2 = as.numeric(cpfstd)) %>% 
  filter(cpfstd2 %in% cpfs_interviewed)
#NA message is not a problem here
stopifnot(length(unique(interviewed_inscritos$cpfstd))==length(cpfs_interviewed))
interviewed_inscritos <- interviewed_inscritos %>% filter(id_edital %in% c("edital17.2016", "edital20.2016"))

table(interviewed_inscritos$id_edital) #smell check

#Selecting lotteries winners in the survey
interviewed_winners <- sorteadosf %>% mutate(cpfstd2 = as.numeric(cpfstd)) %>% 
  filter(cpfstd2 %in% cpfs_interviewed)
interviewed_winners <- interviewed_winners %>% filter(id_edital %in% c("edital17.2016", "edital20.2016"))
table(interviewed_winners$id_edital) #smell check

#Creating TT labels
interviewed_winners <- interviewed_winners %>% mutate(TT1 = case_when(id_edital == "edital17.2016" ~ "T172016", 
                                                      id_edital == "edital20.2016" ~ "T202016")) %>% 
                                              select(cpfstd, cpfstd2, names, TT1, id_edital)

table(interviewed_winners$TT1, interviewed_winners$id_edital) #sanity check

interviewed_inscritos2 <- left_join(interviewed_inscritos, interviewed_winners, by = c("cpfstd", "id_edital"))

nrow(interviewed_inscritos2) == nrow(interviewed_inscritos)

interviewed_inscritos2 <- interviewed_inscritos2 %>% mutate(TT = case_when(id_edital =="edital17.2016" & is.na(TT1) ~ "C172016", 
                                                                id_edital =="edital20.2016" & is.na(TT1) ~ "C202016", 
                                                                id_edital =="edital17.2016" & !is.na(TT1) ~ "T172016", 
                                                                id_edital =="edital20.2016" & !is.na(TT1) ~ "T202016"))
#sanity checks
table(interviewed_inscritos2$TT)
table(interviewed_inscritos2$TT, interviewed_inscritos2$TT1, interviewed_inscritos2$id_edital)
table(interviewed_inscritos2$TT, is.na(interviewed_inscritos2$TT1), interviewed_inscritos2$id_edital)


#Stacking for invidiual-lottery format
data_ind_lottery <- interviewed_inscritos2
stopifnot(length(unique(data_ind_lottery$cpfstd))==800)
data_ind_lottery <- data_ind_lottery %>% select(-cpfstd2.y) %>% mutate(cpfstd2 = cpfstd2.x)

#Merge with survey data
survey_W1 <- survey_W1 %>% mutate(cpfstd2 = cpfstd)
survey_W1_v2 <- data_ind_lottery %>% left_join(survey_W1, by = "cpfstd2")
stopifnot(nrow(survey_W1_v2)==nrow(data_ind_lottery))

#Housekeeping
survey_W1_stacked <- survey_W1_v2 %>% select(-TT1, -cpfstd2.x, - names.y) %>% rename(cpfstd = cpfstd.x)

################################# Adding Rais data
 
#Joining
survey_W1_stacked <- survey_W1_stacked %>% left_join(all_wide_rais_all_editais, by = "cpfstd")
stopifnot(nrow(survey_W1_stacked)==1272)
 
############################################## Adding date of birth date

#Adjust ids to join with dataset
dob$cpfstd <- sprintf("%011.0f", dob$cpf)
dob$dob <- as.Date(as.character(dob$NASCIMENTO_ENR))
dob <- subset(dob, select = c(cpfstd,dob))
survey_W1_stacked <- survey_W1_stacked %>% left_join(dob, by = "cpfstd")

stopifnot(nrow(survey_W1_stacked)==1272)

############################################## Adding CADUNICO (as of 09/2016)

load(paste0(home,"DB/BASE_PESQUISADOR_PESS_092016.RData")) 
cadunico <- data.frame(cpfstd=sprintf("%011.0f",cpfs_interviewed),cadunico=is.element(cpfs_interviewed,p$cpfstd))
survey_W1_stacked <- survey_W1_stacked %>% left_join(cadunico, by = "cpfstd")

##################################### Creating and adding survey weights

#Population from which original sample was drawn
load(paste0(home, "DB/survey.late.population.RData"))

nonwinners <- subset(pop,treat==F)
winners <- subset(pop,treat==T)

# Check distribution
table(nonwinners$ran17,nonwinners$ran20)

#Probability of sampling nonwinners
prob_sampling_nonwinner_round1 <- 4000/nrow(nonwinners)
prob_sampling_winner_round1 <- 1

# Add CPFs to the original sample
load(paste0(home, "DB/survey.late.sampleparaenriquecimento.RData"))

#list of original samples of cpfs with survey weights
the.sample_weights <- the.sample %>% select(cpfstd, names.clean.17, names.clean.20, treat) %>%
  mutate(sample_weights = ifelse(treat == F, prob_sampling_nonwinner_round1, 
                                 ifelse(treat == T, prob_sampling_winner_round1, NA)))

save(the.sample_weights, file = paste0(home,"DB/the.sample_weights.Rda"))
rm(pop, winners, nonwinners, the.sample)

################### Additional Sample Using Reserva

load(paste0(home,"DB/survey.late.population_reposicao.RData"))
nonwinners <- subset(pop_reposicao,treat==F)
winners <- subset(pop_reposicao,treat==T)

#probabilities of being sampled
ratio <- 3975/2161 #sample(1:nrow(nonwinners), nrow(winners)*ratio, replace = FALSE) 
                   #sample size comes from here 3975/2161 * 668 (new winners) = 1228
prob_sampling_nonwinner_round1_reposicaoonly <- 1228/nrow(nonwinners)
prob_sampling_winner_round1_reposicao <- 1 #reserva winners

#combine the two probabilities to produce the actual probability of selection
#prob. of original selection + (1-original) * probability in "reposicao":
prob_sampling_nonwinner_round1_reposicao <- prob_sampling_nonwinner_round1 + (1-prob_sampling_nonwinner_round1)*prob_sampling_nonwinner_round1_reposicaoonly

#Keep CPFS for merge
load(paste0(home, "DB/survey.late.sampleparaenriquecimento_reposicao.RData"))

#list round 1 reposicao of cpfs with survey weights
the.sample_weights_reposicao <- the.sample_reposicao %>% select(cpfstd, names.clean.17, names.clean.20, treat) %>%
  mutate(sample_weights = ifelse(treat == F, prob_sampling_nonwinner_round1_reposicao, 
                                 ifelse(treat == T, prob_sampling_winner_round1_reposicao, NA)))

############## Combining all weights (original and resample) 
survey_weights <- rbind(the.sample_weights,the.sample_weights_reposicao)
survey_weights <- survey_weights %>% rename(sampling_prob=sample_weights)#change name to better reflect variable
table(survey_weights$sampling_prob)

rm(pop_reposicao, winners, nonwinners, the.sample_reposicao,the.sample_weights,the.sample_weights_reposicao,prob_sampling_nonwinner_round1_reposicaoonly)

save(survey_weights, file = paste0(home, "DB/W1_survey_weights.Rda"))

#Joining with survey
survey_W1_stacked <- survey_W1_stacked %>% left_join(survey_weights, by = "cpfstd")
stopifnot(nrow(survey_W1_stacked)==1272)

save(survey_W1_stacked, file = paste0(home, "DB/W1_data_prerecode.Rda"))

##################################### Recoding outcomes and pre-treatment

#Indicator for winning a lottery
survey_W1_stacked <- survey_W1_stacked %>% mutate(Z = ifelse(TT %in% c("T172016", "T202016"), 1, 0), 
                                                  inscreveu = dplyr::recode(q74, `1` = 1, `2` = 0,  .default = as.numeric(NA_character_)), 
                                                  inscreveu_r = dplyr::recode(q74, `1` = 1, `2` = 0, `8` = 0, `9` = 0, .default = as.numeric(NA_character_)), 
                                                  ganhou = dplyr::recode(q80, `1` = 1, `2` = 0, `8` = 0, .default = as.numeric(NA_character_)),
                                                  ganhou_r = dplyr::recode(q80, `1` = 1, `2` = 0, `8` = 0, `9` = 0,  .missing = 0))
                                                  #ganhou_r includes as 0 (did not learn about winning) those who said they
                                                  #had not enrolled in MCMV, those who did not learn about any result
                                                  #and those who learned they did not win. Hence equates not learning with 
                                                  #with learning they didn't win. 

#Outcomes and pre-treatment covariates
survey_W1_stacked <- survey_W1_stacked %>% mutate(votept_2014_d = dplyr::recode(q61, `1` = 1, `2` = 0, `3` = 0, `6` = 0, `7` = 0, .default = as.numeric(NA_character_)), 
                            voteinc_2016_d = dplyr::recode(q59, `3` = 1, `1` = 0, `2` = 0, `4` = 0, `5` = 0, `6` = 0, `7` = 0, 
                                                        .default = as.numeric(NA_character_)),
                            turnout_2014 = dplyr::recode(q60, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            turnout_2016 = dplyr::recode(q56, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            lula_eval = dplyr::recode(q48, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)), 
                            dilma_eval = dplyr::recode(q47, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            temer_eval = dplyr::recode(q49, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            paes_eval = dplyr::recode(q50, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)), 
                            pezao_eval = dplyr::recode(q51, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            party_aff = dplyr::recode(q52, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            party_pt = dplyr::recode(q53, `1` = 1, `2` = 0, `3` = 0, `7` = 0, `8` = 0, 
                                                     `9` = 0, `10` = 0, `11` = 0, `66` = 0, .default = as.numeric(NA_character_)),
                            party_pt_r = dplyr::recode(q53, `1` = 1, `2` = 0, `3` = 0, `7` = 0, `8` = 0, 
                                                       `9` = 0, `10` = 0, `11` = 0, `66` = 0, .missing = 0),
                            party_pmdb = dplyr::recode(q53, `1` = 0, `2` = 1, `3` = 0, `7` = 0, `8` = 0, 
                                                       `9` = 0, `10` = 0, `11` = 0, `66` = 0, .default = as.numeric(NA_character_)), 
                            party_pdt = dplyr::recode(q53, `1` = 0, `2` = 0, `3` = 0, `7` = 0, `8` = 0, 
                                                      `9` = 1, `10` = 0, `11` = 0, `66` = 0, .default = as.numeric(NA_character_)),
                            party_psol = dplyr::recode(q53, `1` = 0, `2` = 0, `3` = 0, `7` = 1, `8` = 0, 
                                                       `9` = 1, `10` = 0, `11` = 0, `66` = 0, .default = as.numeric(NA_character_)), 
                            party_anti = dplyr::recode(q54, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            anti_pt_r = as.numeric(grepl("01",survey_W1_stacked$q55)), #fulcrum allowed multiple responses, so look for "01"
                            client_g = dplyr::recode(q57, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            client_ind = dplyr::recode(q58, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            cantril = ifelse(q8 < 11, q8, NA), 
                            q10r = dplyr::recode(q10, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_1r = dplyr::recode(q9_1, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_2r = dplyr::recode(q9_2, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_3r = dplyr::recode(q9_3, `1` = -1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_4r = dplyr::recode(q9_4, `1` = -1, `2` = 0, .default = as.numeric(NA_character_)),
                            emotions = q9_1r + q9_2r + q9_3r + q9_4r,                             
                            placement = ifelse(q36 < 11, q36, NA), 
                            govind  = dplyr::recode(q11, `2` = 1, `1` = 0, `3` = 0, .default = as.numeric(NA_character_)), 
                            competition  = dplyr::recode(q12, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)), #what to do about indiferente?
                            q34r = dplyr::recode(q34, `1` = 2, `2` = 1, `5` = -2, `8` = -1, `3` = 0,
                                                 `4` = 0, .default = as.numeric(NA_character_)), #nao sei as 0
                            q35r = dplyr::recode(q35, `1` = 2, `2` = 1, `4` = -2, `8` = -1,
                                                 `3` = 0, .default = as.numeric(NA_character_)),
                            q38r = dplyr::recode(q38, `1` = 2, `2` = -2, `3` = 0, .default = as.numeric(NA_character_)),
                            prosp_self = dplyr::recode(q37, `1` = 1, `2` = -1, `3` = 0, .default = as.numeric(NA_character_)), #what to do with NS?
                            dream = dplyr::recode(q38a, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            #In the next items, high values mean right wing (lower taxes or less redistribution)
                            tax_item = dplyr::recode(q43, `1` = 1, `2` = -1, `3` = 0, .default = as.numeric(NA_character_)),
                            taxes_fair = dplyr::recode(q44, `0` = 10, `1` = 9, `2` = 8, `3` = 7, `4` = 6,
                                                 `5` = 5, `6` = 4, `7` = 3, `8` = 2, `9` = 1, `10` = 0, 
                                                 .default = as.numeric(NA_character_)),#formerly 44_r
                            red_abstract = dplyr::recode(q45, `2` = 1, `1` = 0, .default = as.numeric(NA_character_)), 
                            red_concrete = dplyr::recode(q46, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            red_concrete_v2 = dplyr::recode(q46, `1` = 1, `2` = -1, .default = as.numeric(NA_character_)),
                            #For components of the market index, high values mean pro-market/right wing
                            effortbetter = dplyr::recode(q13, `1` = 1, `2` = 1, `3` = 0, 
                                                 `4` = 0, `5` = 0, `6` = 0, .default = as.numeric(NA_character_)),
                            trustothers = dplyr::recode(q14, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)),
                            successalone = dplyr::recode(q15, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)),
                            moneyimportant = dplyr::recode(q16, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)),   
                            #Assoc
                            assoc = dplyr::recode(q7, `1` = 2, `2` = 1, `3` = -1, `4`= -2,  
                                                  .default = as.numeric(NA_character_)),
                            religiao_dummy = dplyr::recode(religiao, `1` = 1, `2` = 1, `3` = 1, `4` = 1, `5` = 1, `8` = 1, `6`=0, `7`=0, .default = as.numeric(NA)),    
                            religiao_prot = dplyr::recode(religiao, `1` = 0, `2` = 1, `3` = 1, `4` = 0, `5` = 0, `8` = 0, `6`=0, `7`=0, .default = as.numeric(NA)), 
                            sexor = dplyr::recode(sexo, `1` = 1, `2` = 0), data_nasc = ymd(nasc),
                            idade = interval(start = dob, end = ymd("2018-02-02"))/duration(num = 1, units = "years"), 
                            idadeW2 = interval(start = dob, end = ymd("2020-05-15"))/duration(num = 1, units = "years"),
                            idade_old = idade > median(idade,na.rm=T),
                            race_bin = dplyr::recode(cor, `1` = 1, `2` = 0, `3` = 0, `4` = 0, `5` = 0,
                                                     .default = as.numeric(NA)),
                            sch_years = dplyr::recode(esc, `1` = 0, `2` = 1, `3` = 4, `4` = 8, `5` = 11, `6` = 12, `7` = 15, 
                                                      .default = as.numeric(NA)),
                            sch_high = dplyr::recode(esc, `1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 1, `6` = 1, `7` = 1, 
                                                     .default = as.numeric(NA)),
                            criancas_moram = q2,
                            work_status =  ifelse(q26>7,NA,q26),
                            work_activity = ifelse(q28>7,NA,q28),
                            pea = ifelse(q27 == 1, 1, 0),
                            inc_r = ifelse(rendaf > 7, NA, rendaf),
                            inc_above1mw = as.numeric(inc_r>1),
                            commute = as.numeric(as.duration(hm(survey_W1_stacked$q32, quiet = T)),"minutes"),
                            commute = ifelse(commute < 1439, commute, NA),
                            #there is more missing
                            commute_d = ifelse(commute < median(commute, na.rm = T), 0 , 1),
                            commute_mode = ifelse(q33 > 10, NA, q33),
                            homeowner = dplyr::recode(q17, `1` = 1, `2` = 0, `3` = 0, `4` = 0, `5` = 0, 
                                           .default = as.numeric(NA_character_)),
                            homerenter = dplyr::recode(q17, `1` = 0, `2` = 1, `3` = 0, `4` = 0, `5` = 0, 
                                           .default = as.numeric(NA_character_)),
                            homesquat = dplyr::recode(q17, `1` = 0, `2` = 0, `3` = 1, `4` = 1, `5` = 1, 
                                           .default = as.numeric(NA_character_)),                          
                            homestatus = factor(dplyr::recode(q17, `1` = 0, `2` = 1, `3` = 2, `4` = 2, `5` = 2, 
                                           .default = as.numeric(NA_character_))),
                            #originally, we only had "rent_spend", for symmetry with W2 I adapted to to spend_home
                            spend_rent = ifelse(c(q18 == 8888 | q18 == 9999), NA, q18),
                            spend_cond = ifelse(q20 == 999, NA, q20),
                            spend = rowSums(cbind(spend_rent,spend_cond),na.rm=T), 
                            spend_home = ifelse(is.na(spend_rent) & is.na(spend_cond), NA, spend),
                            #variables for satisfaction with current home
                            satisf_size = ifelse(q22_1==99,NA,q22_1),
                            satisf_location = ifelse(q22_2==99,NA,q22_2),
                            satisf_cost = ifelse(q22_3==99,NA,q22_3),
                            #variables for satisfaction with neighborhood
                            satisfn_violence = ifelse(q23_1==99,NA,q23_1),
                            satisfn_lights = ifelse(q23_2==99,NA,q23_2),
                            satisfn_transp = ifelse(q23_3==99,NA,q23_3),
                            satisfn_health = ifelse(q23_4==99,NA,q23_4),
                            satisfn_educ = ifelse(q23_5==99,NA,q23_5),
                            netwk_family = dplyr::recode(q24, `1` = 2, `2` = 1, `3` = -1, `4` = -2,  
                                                         .default = as.numeric(NA_character_)),            
                            netwk_neighb = dplyr::recode(q25, `1` = 2, `2` = 1, `3` = -1, `4` = -2,  
                                                         .default = as.numeric(NA_character_)),                
                            improved_lives = dplyr::recode(q62, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, 
                                                     .default = as.numeric(NA_character_)),
                            q63r = ifelse(q63 < 80, q63, NA), 
                            resp_lula = ifelse(q63r == 3, 1, 0),
                            resp_lula_dilma = ifelse(q63r == 3 |q63r == 2, 1, 0),
                            resp_lula_dilma_pt = ifelse(q63r == 3 |q63r == 2 | q63r == 8, 1, 0), 
                            approve_lottery = dplyr::recode(q64, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, 
                                                            .default = as.numeric(NA_character_)),
                            q65r = ifelse(q65 < 8, q65, NA),#até aqui ok
                            continue_temer = dplyr::recode(q65r, `1` = 2, `2` = 1, `3` = -1, `4` = -2, 
                                                               .default = as.numeric(NA_character_)),
                            q66r = ifelse(q66 < 8, q66, NA),
                            continue_lula = dplyr::recode(q66r, `1` = 2, `2` = 1, `3`=-1, `4` = -2,  
                                                                .default = as.numeric(NA_character_)),
                            q67r = ifelse(q67 < 8, q67, NA),
                            increase_lula = dplyr::recode(q67r, `1` = 1, `3` = 0, `2` = -1, 
                                                              .default = as.numeric(NA_character_)),
                            q68r = ifelse(q68 < 8, q68, NA),
                            continue_alckimin = dplyr::recode(q68r, `1` = 2, `2` = 1, `3`=-1, `4` = -2, 
                                                                    .default = as.numeric(NA_character_)),
                            q69r = ifelse(q69 < 8, q69, NA),
                            increase_alckimin = dplyr::recode(q69r, `1` = 1, `3` = 0,  `2` = -1,  
                                                                  .default = as.numeric(NA_character_)),
                            learned_how = as.numeric(ifelse(q75 < 10, q75, NA)),
                            learned_friendsfamily = as.numeric(learned_how == 1),
                            learned_media =  as.numeric(learned_how == 2|learned_how == 6|learned_how == 7),
                            learned_networks = as.numeric(learned_how == 3|learned_how == 4|learned_how == 5),
                            #Would move combines winners and losers
                            #Note that one respondent that answered 5 declined because she had a Faixa II ap. 
                            #Its the treated one not living in Saboia/Sepetiba
                            q81r = dplyr::recode(q81, `1` = 1, `2` = -1, `3` = 0, `4`= 0,
                                                 `5`=1, .default = as.numeric(NA_character_)),
                            moved = ifelse(is.na(q81),0,ifelse(q81==5,1,0)),
                            q87r = dplyr::recode(q87, `1` = 1, `2` = -1, `3` = 0, `4`= 0,
                                                 .default = as.numeric(NA_character_)),
                            would_move = ifelse(is.na(q87r),q81r,q87r),
                            hope_better_life = dplyr::recode(q90, `1` = 1, `2` = 0, `3` = 0, `4`= 0,
                                                             .default = as.numeric(NA_character_)),
                            rr2003 = ifelse(is.na(`2003`), 0, `2003`),
                            rr2004 = ifelse(is.na(`2004`), 0, `2004`),
                            rr2005 = ifelse(is.na(`2005`), 0, `2005`),
                            rr2006 = ifelse(is.na(`2006`), 0, `2006`),
                            rr2007 = ifelse(is.na(`2007`), 0, `2007`),
                            rr2008 = ifelse(is.na(`2008`), 0, `2008`),
                            rr2009 = ifelse(is.na(`2009`), 0, `2009`),
                            rr2010 = ifelse(is.na(`2010`), 0, `2010`),
                            r2010 = ifelse(is.na(`2010`), 0, `2010`),
                            r2011 = ifelse(is.na(`2011`), 0, `2011`),
                            r2012 = ifelse(is.na(`2012`), 0, `2012`),
                            r2013 = ifelse(is.na(`2013`), 0, `2013`),
                            r2014 = ifelse(is.na(`2014`), 0, `2014`),
                            rr2003l = log(1+rr2003),
                            rr2004l = log(1+rr2004),
                            rr2005l = log(1+rr2005),
                           rr2006l = log(1+rr2006),
                            rr2007l = log(1+rr2007),
                            rr2008l = log(1+rr2008),
                            rr2009l = log(1+rr2009),
                            rr2010l = log(1+rr2010),
                            r2010l = log(1+r2010),
                            r2011l = log(1+r2011),
                            r2012l = log(1+r2012),
                            r2013l = log(1+r2013),
                            r2014l = log(1+r2014), 
                           #additional recode for BJPS
                           party_mdb_r = dplyr::recode(q53, `10` = 1, `1` = 0,
                                                       `2` = 1, `3` = 0, `4` = 0, `5` = 0, 
                                                       `6` = 0, `7` = 0, `8` = 0,`9` = 0,`11` = 0, `12` = 0
                                                       ,`66` = 0, `88`=0, `99`=0, .missing  = as.double(0)),
                           resp_paes = ifelse(q63r == 6, 1, 0),
                           resp_paes_pezao_temer = ifelse(q63r == 6 |q63r == 1 |q63r == 5, 1, 0),
                           resp_paes_pezao_temer_mdb = ifelse(q63r == 6 |q63r == 1 |q63r == 5 | q63r == 10, 1, 0), 
                           resp_know_dontknow = ifelse(q63 %in% c(88, 99), 0, 1))#end rename

# Drop individuals who had moved ###
comment.text <- paste("Dropped ",sum(survey_W1_stacked$moved,na.rm=T)
                      ," (of ",nrow(survey_W1_stacked)
                      ,") individual/lottery observations who reported having moved prior to survey.\n",sep="")
survey_W1_stacked <- subset(survey_W1_stacked,moved!=1)
cat("Keeping",nrow(survey_W1_stacked),"individual/lottery observations.\n")

# Create summary RAIS variables pre-treatment (2010-2014)
yrs <- length(grep("^r2\\d\\d\\d$",names(survey_W1_stacked)))
survey_W1_stacked$av_formalinc_log <- log(1+rowSums(survey_W1_stacked[,grep("^r2\\d\\d\\d$",names(survey_W1_stacked))],na.rm=T)/yrs)
survey_W1_stacked$formal <- rowSums(survey_W1_stacked[,grep("^r2\\d\\d\\d$",names(survey_W1_stacked))]!=0)
survey_W1_stacked$formal_prop <- rowSums(survey_W1_stacked[,grep("^r2\\d\\d\\d$",names(survey_W1_stacked))]!=0)/yrs

yrs <- length(grep("^rr2\\d\\d\\d$",names(survey_W1_stacked)))
survey_W1_stacked$av_prel <- log(1+rowSums(survey_W1_stacked[,grep("^rr2\\d\\d\\d$",names(survey_W1_stacked))],na.rm=T)/yrs)
survey_W1_stacked$formal_pre <- rowSums(survey_W1_stacked[,grep("^rr2\\d\\d\\d$",names(survey_W1_stacked))]!=0)
survey_W1_stacked$formal_pre_prop <- rowSums(survey_W1_stacked[,grep("^rr2\\d\\d\\d$",names(survey_W1_stacked))]!=0)/yrs

#Creating the Kahneman & Deaton positive affect and blue affect indices
#This is not a normalized index, but we also employ imputation
kahndeat_index_vars <- survey_W1_stacked %>% select(q9_1r,q9_2r,q10r,q9_3r,q9_4r)
kahndeat_index_vars <- imp.mice(kahndeat_index_vars)
survey_W1_stacked$kahndeat_pos  <- apply(subset(kahndeat_index_vars,select=c(q9_1r,q9_2r,q10r)), 1 ,mean)
survey_W1_stacked$kahndeat_blue  <- -1*apply(subset(kahndeat_index_vars,select=c(q9_3r,q9_4r)), 1 ,mean) 
rm(kahndeat_index_vars)


# Create Standardized Indices 
# We create standardized additive indices after imputing missing observations
# We employ multiple imputation within the set of items of each index
# If all items are missing, we drop
exp_index_vars <- imp.mice(survey_W1_stacked %>% select(prosp_self, q34, q35, q38))#the last three are mostly missing
      #PAP predicted the definition above, but q34, q35 and q38 were only applicable to 
      #households with children, so there would be more than 600 NAs in each, all in the same
      #obervations. Hence, we shifted the exp_index_vars in W1 to reflect just pros_self (see below)
      #In W2, we added a sociotropic version of expectations to the index. 
red_index_vars <- imp.mice(survey_W1_stacked %>% select(tax_item, red_abstract, red_concrete)) 
red_index_vars_v2 <- imp.mice(survey_W1_stacked %>% select(tax_item, red_abstract, red_concrete_v2)) # tax_item + red_abstract + red_concrete,
    # W2 PAP predicted tax_item as a separate outcome, and predicted three components to redistribution.
    # Cant'have it both ways, so we decided to drop the tax_item as a separate item:
market_index_vars <- imp.mice(survey_W1_stacked %>% select(effortbetter,
                                                           trustothers,
                                                           successalone,
                                                           moneyimportant)) #market_index = q13r + q14r + q15r + q16r,
self_index_vars <- imp.mice(survey_W1_stacked %>% select(govind, competition)) #self_index = q11r + q12r, 
hap_index_vars <- imp.mice(survey_W1_stacked %>% 
                             select(kahndeat_pos, kahndeat_blue, cantril) %>%
                             mutate(kahndeat_blue=-1*kahndeat_blue)) #invert blue so that polarity is the same for all
satisfh_index_vars <- imp.mice(survey_W1_stacked %>% select(satisf_size,satisf_location,satisf_cost))#one item is different from W2
satisfn_index_vars <- imp.mice(survey_W1_stacked %>% select(satisfn_violence,satisfn_transp
                                                 ,satisfn_health,satisfn_educ))

survey_W1_stacked <- survey_W1_stacked %>% bind_cols(
  red_index = calculate_mean_effects_index(Z = survey_W1_stacked$Z
                                           , outcome_mat = red_index_vars, greedy = TRUE ), 
  red_index_v2 = calculate_mean_effects_index(Z = survey_W1_stacked$Z
                                           , outcome_mat = red_index_vars_v2, greedy = TRUE ), 
  market_index = calculate_mean_effects_index(Z = survey_W1_stacked$Z
                                              , outcome_mat = market_index_vars, greedy = TRUE ), 
  self_index = calculate_mean_effects_index(Z = survey_W1_stacked$Z      
                                            , outcome_mat = self_index_vars, greedy = TRUE ),
  hap_index =  calculate_mean_effects_index(Z = survey_W1_stacked$Z
                                            , outcome_mat = hap_index_vars,  greedy = TRUE ), 
  exp_index = survey_W1_stacked$prosp_self,#see explanation, a few lines above
  satisfh_index = calculate_mean_effects_index(Z = survey_W1_stacked$Z
                                               , outcome_mat = satisfh_index_vars,  greedy = TRUE ),
  satisfn_index = calculate_mean_effects_index(Z = survey_W1_stacked$Z
                                               , outcome_mat = satisfn_index_vars, greedy = TRUE ))

rm(red_index_vars,red_index_vars_v2,
   market_index_vars,self_index_vars
   ,hap_index_vars,exp_index_vars
   ,satisfh_index_vars,satisfn_index_vars)

#Inputation of Covariates
#Checking if missing exceeds 10%
prop.table(table(is.na(survey_W1_stacked$religiao_prot)))*100
prop.table(table(is.na(survey_W1_stacked$religiao_dummy)))*100
prop.table(table(is.na(survey_W1_stacked$race_bin)))*100
prop.table(table(is.na(survey_W1_stacked$sch_high)))*100
prop.table(table(is.na(survey_W1_stacked$sexor)))*100#no missing
prop.table(table(is.na(survey_W1_stacked$formal)))*100#no missing
prop.table(table(is.na(survey_W1_stacked$av_formalinc_log)))*100#no missing

survey_W1_stacked$religiao_prot <- imp.meanmode(survey_W1_stacked$religiao_prot)#inputing the mode
survey_W1_stacked$religiao_dummy <- imp.meanmode(survey_W1_stacked$religiao_dummy)#inputing the mode
survey_W1_stacked$race_bin  <- imp.meanmode(survey_W1_stacked$race_bin)
survey_W1_stacked$sch_high <- imp.meanmode(survey_W1_stacked$sch_high)
survey_W1_stacked$sch_years <- imp.meanmode(survey_W1_stacked$sch_years)

############# Computing variables for heterogeneous effects (of waiting) 

# Check in which project winners eventually signed a contract
survey_W1_stacked <- survey_W1_stacked %>% left_join(subset(rj,select=c(cpfstd,Nome_Empreendimento,Data_da_Assinatura)),by="cpfstd")
    table(survey_W1_stacked$Nome_Empreendimento,survey_W1_stacked$Z)
    table(survey_W1_stacked$Data_da_Assinatura)  

tmp <-  na.omit(data.frame(as.Date(survey_W1_stacked$created_at),
                       as.Date(survey_W1_stacked$Data_da_Assinatura),
                       dif=as.numeric(as.Date(survey_W1_stacked$created_at)-as.Date(survey_W1_stacked$Data_da_Assinatura)),
                       moved= survey_W1_stacked$q81,
                       Z=survey_W1_stacked$Z))

#Create a flag for treated individuals who already moved  
survey_W1_stacked$moved_admin <- survey_W1_stacked$moved <-F
survey_W1_stacked$moved_admin[which(is.na(survey_W1_stacked$Data_da_Assinatura)==F)] <- T
survey_W1_stacked$moved[which(survey_W1_stacked$q81==5)] <- T
      table(Survey_Moved=survey_W1_stacked$moved,Admin_Moved=survey_W1_stacked$moved_admin)

# Start by recoding the one case of ex-post treatment as a control cases
# This person won a posterior lottery. As her lottery is not in sample, she has Z=0 and D=0
# She has a win.date, even tough all controls should be NA on this variable
# survey$win.date[which(survey$Z==F)] <- NA

# Compute days from interviews to winning
# Ignoring anybody who won a lottery after the two we're looking at. 
survey_W1_stacked$win.date <- as.Date(ifelse(survey_W1_stacked$sorteio17.2016,"2016-10-22",
                              ifelse(survey_W1_stacked$sorteio20.2016,"2016-11-05",NA)))

survey_W1_stacked$days.since <- as.numeric(as.Date(survey_W1_stacked$created_at) - survey_W1_stacked$win.date)

#Turn into NA in "days.since" the few cases of treated individuals that had already moved by interview
survey_W1_stacked$days.since[which(survey_W1_stacked$moved==T)]<-NA

#Quantiles of days since  
survey_W1_stacked$days.q <- cut(survey_W1_stacked$days.since
				,breaks = quantile(survey_W1_stacked$days.since, probs = seq(0,1, by=0.25), na.rm=T), 
                  include.lowest = TRUE, labels = F)  
 
#Replace controls (which are NA's in days.since) with 0, so that observations are not dropped in aanalysis
survey_W1_stacked$days.q[which(survey_W1_stacked$Z==0)]<-0

#Creating combined indicator
survey_W1_stacked$treatwait <- ifelse(survey_W1_stacked$days.q=="0","_ctrl"
                            ,ifelse(survey_W1_stacked$days.q=="1","Z_1recent","Z_2waiting"))
table(survey_W1_stacked$Z, survey_W1_stacked$treatwait, useNA="always")
by(survey_W1_stacked$days.since,survey_W1_stacked$treatwait,mean,na.rm=T)

#Read Geocoding and Distances 
load(paste0(home, 'DB/enderecos_survey_geolocalizados.RData'))
dg <- subset(dg,select=c(cpfstd,residencial,cidade,geo.lat,geo.long,geo.cosdist))#drop addresses etc (reduce object size)
dg <- dg[-which(duplicated(dg$cpfstd)),]#eliminate duplicate entry in the address list
survey_W1_stacked <- merge(survey_W1_stacked,dg,by="cpfstd",all.x=T)
rm(dg)

#Replace NAs with controls in residencial (name of the project, control groups does not have this infor)
survey_W1_stacked$residencial[which(is.na(survey_W1_stacked$residencial))]<-"_ctrl"

#winsorize distance to 75km (up to Niterói); ALternatively, make them NA or use previous address
# #alternative would be drop or code previous address in Rio. 
survey_W1_stacked$geo.cosdist[which(survey_W1_stacked$geo.cosdist>75000)]<-75000

#Code far away and near places (at median)
survey_W1_stacked$treatdist <- ifelse(survey_W1_stacked$Z==0,"_ctrl",
                               ifelse(survey_W1_stacked$geo.cosdist<=median(survey_W1_stacked$geo.cosdist,na.rm=T),"Z_1near","Z_2far"))
                                 
#Replace NAs with zero for regressions (control groups has no distance)
survey_W1_stacked$geo.cosdist[which(survey_W1_stacked$Z==0)]<-0
 
##################################### Adding administrative compliance data

tel201617 <- read_excel(path=paste0(home, "DB/sorteados_2016_17.xlsx"))
tel201617$edital <- 201617
tel201620 <- read_excel(path=paste0(home, "DB/sorteados_2016_20.xlsx"))
tel201620$edital <- 201620

telegrams <- rbind(tel201617,tel201620)
telegrams <- telegrams %>% rename(delivered='telegrama entregue',
                     delivery_date='quando o telegrama foi entregue',
                     ineligible='não enquadramento',
                     ineligibility_reason='razão não enquadramento',
                     withdrawal='desistência') %>%  select('IMG#',edital,cpfstd,delivered,delivery_date,ineligible,ineligibility_reason,withdrawal)

#correct telegrams that were actually delivered (we checked the images)
telegrams$delivered[which(telegrams$delivered!="sim"&is.na(telegrams$delivery_date)==F)]<-"sim"
telegrams$delivered[which(telegrams$delivered=="não sim")]<-"sim"

#missing CPFS and fix for merging
table(Missing.CPF=is.na(telegrams$cpfstd),Delivered=telegrams$delivered)
telegrams <- subset(telegrams,is.na(cpfstd)==F)#drop 119 cases without CPF (not in our sample anyway)
telegrams$cpfstd[which(telegrams$cpfstd=="não")] <- "11285534735"
telegrams$cpfstd <- sprintf("%011.0f", as.numeric(telegrams$cpfstd))

#recode the variables to logcal and filter only those absolutely necessary
telegrams <- telegrams %>% mutate(delivered_r=ifelse(telegrams$delivered=="não"|is.na(telegrams$delivered),0,1),
                                  delivered=dplyr::recode(delivered, `sim` = 1, `não` = 0, .default = as.numeric(NA_character_)),
                                  ineligible=dplyr::recode(ineligible, `sim` = 1, .missing = 0),
                                  withdrawal=dplyr::recode(withdrawal, `sim` = 1, .missing = 0))  %>% select(cpfstd,delivered,delivered_r,ineligible,withdrawal)

#merge the telegram data back into the data
survey_W1_stacked <- merge(survey_W1_stacked,telegrams,by="cpfstd",all.x=T)

#recode delivered (so that the control group receives zeros)
survey_W1_stacked$delivered_r[which(survey_W1_stacked$Z==0)]<-0
survey_W1_stacked$ganhou_composite <- survey_W1_stacked$ganhou_r==1|survey_W1_stacked$delivered_r==1

#smell check
table(Composto=survey_W1_stacked$ganhou_composite,Z=survey_W1_stacked$Z,useNA="always")
table(Ganhou=survey_W1_stacked$ganhou_r==1,Z=survey_W1_stacked$Z,useNA="always")

#First lottery
  
#Get first year of participation
lots <- survey_W1_stacked %>% select(edital03.2011:edital16.2018, cpfstd)
lots <- unique(lots)
cols <- c("edital03.2011", "edital06.2011", "edital09.2011", "edital12.2012", "edital03.2013", "edital06.2013",
          "edital01.2014", "edital01.2015", "edital04.2015", "edital07.2015", "edital13.2015", "edital14.2015",
          "edital15.2015", "edital18.2015", "edital19.2015", "edital26.2015", "edital27.2015", "edital28.2015",
          "edital29.2015", "edital30.2015", "edital31.2015", "edital34.2015", "edital36.2015", "edital37.2015",
          "edital03.2016", "edital04.2016", "edital05.2016", "edital06.2016", "edital07.2016", "edital10.2016",
          "edital13.2016", "edital14.2016", "edital17.2016", "edital18.2016", "edital19.2016", "edital20.2016",
          "edital05.2017", "edital06.2017", "edital07.2017", "edital08.2017", "edital09.2017", "edital10.2018",
          "edital11.2018", "edital12.2018", "edital13.2018", "edital14.2018", "edital15.2018", "edital16.2018") 

lots$pattern <- apply(lots[ ,cols], 1, paste , collapse = "")

lots <- lots %>% mutate(first_location = str_locate(pattern, "1")[,1])
lots$edital_first <- names(lots)[lots$first_location]  

#Datas sorteios
#Two dates for sorteios that seemed to have taken place in 2015 are in 2016
datas <- read.csv(paste0(home, "DB/datas_temp.csv"))

#Merging with sorteio
lots <- lots %>% mutate(edital_id = edital_first)
lots <- lots %>% left_join(datas, by = "edital_id")
nrow(lots)==800 #review 

#Getting time
lots <- lots %>% mutate(dates = dmy(as.character(Data.Sorteio))) %>% select(cpfstd, dates, pattern, edital_first)

#Joining with survey data
survey_W1_stacked <- survey_W1_stacked %>% left_join(lots, by = "cpfstd")
  
#Getting time

survey_W1_stacked <- survey_W1_stacked %>% mutate(wait_time_all = interval(start = dates, end = ymd("2016-10-22"))/duration(num = 1, units = "years")) 

#Anonymize
survey_W1_stacked$fieldID #redacted #create fake ids

to.drop <- unique(c(grep("cpf",names(survey_W1_stacked))
               ,grep("name",names(survey_W1_stacked))
               ,grep("name",names(survey_W1_stacked))
               ,grep("_enr",names(survey_W1_stacked))
               ,grep("_fam$",names(survey_W1_stacked))
               ,grep("treat\\.",names(survey_W1_stacked))
               ,which(is.element(names(survey_W1_stacked),c("cidade","geo.lat","geo.long","latitude","longitude","facebook","wpp")))))
  
survey <- survey_W1_stacked[,-to.drop]

comment(survey) <- paste("Dataset created in ",as.Date(Sys.Date()),".\n",comment.text,sep="")
save(survey, file = paste0(home, "DB/surveyW1.RData"))

######### Creating the BJPS dataset

survey <- survey %>% dplyr::select(fieldID, Z, id_edital, sexor, TT, 
                                   race_bin, religiao_dummy,
                                   idade,
                                   criancas_moram, sch_years, cadunico,
                                   formal, av_formalinc_log, 
                                   ganhou_composite, 
                                   ganhou_r,
                                   pea,
                                   inc_above1mw, 
                                   prosp_self, 
                                   sampling_prob, treatwait, 
                                   rr2003,
                                   rr2004,
                                   rr2005,
                                   rr2006,
                                   rr2007,
                                   rr2008,
                                   rr2009,
                                   rr2010, 
                                   idadeW2, formal_pre, av_prel, 
                                   edital03.2011, edital06.2011, edital17.2016, edital20.2016, 
                                   party_mdb_r,
                                   resp_lula_dilma_pt,
                                   resp_paes,
                                   resp_paes_pezao_temer, 
                                   resp_paes_pezao_temer_mdb, 
                                   resp_know_dontknow, 
                                   lula_eval, dilma_eval, party_pt_r,
                                   dilma_eval, temer_eval, paes_eval, 
                                   paes_eval, pezao_eval, voteinc_2016_d, 
                                   wait_time_all, improved_lives, dream)


save(survey, file = paste0(home, "Replication-WhatYouSee-BJPS/Data/surveyW1-WhatYouSee.Rda"))

########################### Creating attrition analysis dataset

load(paste0(home, "DB/W1_survey_weights.Rda")) #use if not in memory

#Joining sample 1
results1 <- left_join(results, final_sample, by = "key") 

#Joining sample 2
results2_1 <- left_join(results2, final_sample2, by = "key") 
results2_1  <- results2_1  %>% mutate(q55 = as.character(q55), q88 = as.character(q88),
                                      q89 = as.character(q88), q82 = as.character(q82))


#Combining
results_f <- bind_rows(results1, results2_1) %>% mutate(cpfstd = sprintf("%011.0f", cpfstd))

#create individual-lottery format

#Removing NA CPFSTD
results_f2 <- results_f %>% filter(!is.na(results_f$cpfstd))
#Making CPF standard
attempted <- unique(results_f2$cpfstd) 

### Read the "sorteados" file #if not in memory
load(file=paste0(home,"DB/sorteados_october2018.Rda"))
### Read the "inscritos" file #if not in memory	
load(file=paste0(home,"DB/inscritos_october2018.Rda"))	

#Select lotteries inscritos
attempted_inscritos <- inscritosf %>% filter(cpfstd %in% attempted)
stopifnot(length(unique(attempted_inscritos$cpfstd))==length(attempted)) 
attempted_inscritos <- attempted_inscritos %>% filter(id_edital %in% c("edital17.2016", 
                                                                       "edital20.2016"))

table(attempted_inscritos$id_edital) #smell check

#Selecting lotteries winners in the attempts
attempted_winners <- sorteadosf %>% filter(cpfstd %in% attempted)
attempted_winners <- attempted_winners %>% filter(id_edital %in% c("edital17.2016", 
                                                                   "edital20.2016"))
table(attempted_winners$id_edital) #smell check

#Creating TT labels
attempted_winners <- attempted_winners %>% mutate(TT1 = case_when(id_edital == "edital17.2016" ~ "T172016", 
                                                                  id_edital == "edital20.2016" ~ "T202016")) %>% 
  select(cpfstd, names, TT1, id_edital)

attempted_inscritos2 <-  left_join(attempted_inscritos, attempted_winners, by = c("cpfstd", "id_edital"))
nrow(attempted_inscritos2) == nrow(attempted_inscritos) 

attempted_inscritos2 <- attempted_inscritos2 %>% mutate(TT = case_when(id_edital =="edital17.2016" & is.na(TT1) ~ "C172016", 
                                                                       id_edital =="edital20.2016" & is.na(TT1) ~ "C202016", 
                                                                       id_edital =="edital17.2016" & !is.na(TT1) ~ "T172016", 
                                                                       id_edital =="edital20.2016" & !is.na(TT1) ~ "T202016"))
table(attempted_inscritos2$TT)

#Stacking for individual-lottery format
data_ind_lottery_attempts <- attempted_inscritos2
stopifnot(length(unique(data_ind_lottery_attempts$cpfstd))==length(attempted)) 

#Adding Rais data
load(paste0(home, "DB/all_wide_rais_october.Rda"))

#Joining
W1_attrition_overall <- data_ind_lottery_attempts %>% left_join(all_wide_rais_all_editais, by = "cpfstd")

stopifnot(nrow(W1_attrition_overall)==nrow(data_ind_lottery_attempts))

#operationalize dependent variable (interviewed == 1, not interviewed == 0)

answered <- results_f2 %>% mutate(answered = ifelse(g1 == "Sim", 1, 0)) %>% filter(answered == 1)

#There are repeated interviews
tmp <- answered %>% filter(duplicated(cpfstd))
#Excluding repeated from survey
answered <- answered %>% filter(!cpfstd %in% tmp$cpfstd)

cpfs_answered <- unique(answered$cpfstd)

#Creating indicator

W1_attrition_overall <- W1_attrition_overall %>% mutate(interviewed = ifelse(cpfstd %in% cpfs_answered, 1, 0))

#Smell checks
table(W1_attrition_overall$interviewed)
table(is.na(W1_attrition_overall$interviewed))

#Treatment indicator
W1_attrition_overall <- W1_attrition_overall %>% mutate(treat_ind = ifelse(TT %in% c("T172016", "T202016"), 1, 0))

#Smell check
table(W1_attrition_overall$edital17.2016, W1_attrition_overall$edital20.2016)

#Getting sex indicator
tmp1 <- get_gender(W1_attrition_overall$names.clean, state = "RJ") #slow
tmp2 <- tibble(tmp1)
W1_attrition_overall <- bind_cols(W1_attrition_overall, tmp2)

#Adding weights
W1_attrition_overall <- W1_attrition_overall %>% left_join(survey_weights, by = "cpfstd")

#Creating id
W1_attrition_overall$fieldID  #redacted #create fake ids

#Making data anonymous

W1_attrition_overall <- W1_attrition_overall %>% select(-cpfstd, -names.x, -cpf.x, -ids, -cpf2.x, 
                                                        -unusable, -cpf3, -ind_notstd, -first_digit_valid,         
                                                        -second_digit_valid, -TT1, -cpf.y, -cpf2.y, 
                                                        -final_invalid, -rep, -names.y,
                                                        -names.clean, -sum_participation, -names.clean.17,
                                                        -names.clean.20,
                                                        -sum_participation_geral, -sum_participation_territorial, 
                                                        -sum_wins, -sum_wins_geral, -sum_wins_territorial )


W1_attrition_overall <- W1_attrition_overall %>% mutate(
  r2010 = ifelse(is.na(`2010`), 0, `2010`),
  r2011 = ifelse(is.na(`2011`), 0, `2011`),
  r2012 = ifelse(is.na(`2012`), 0, `2012`),
  r2013 = ifelse(is.na(`2013`), 0, `2013`), 
  r2014 = ifelse(is.na(`2014`), 0, `2014`), 
  sexor = dplyr::recode(tmp1, `Male` = 1, `Female` = 0))

# Create summary RAIS variables pre-treatment (only years up to 2010 are included)
W1_attrition_overall$av_pre <- rowSums(W1_attrition_overall[,grep("^r2\\d\\d\\d$",names(W1_attrition_overall))],na.rm=T)/5  
W1_attrition_overall$av_prel  <- log(1+W1_attrition_overall$av_pre)
W1_attrition_overall$formal_pre <- rowSums(W1_attrition_overall[,grep("^r2\\d\\d\\d$",names(W1_attrition_overall))]!=0)
W1_attrition_overall$formal_pre_prop <- rowSums(W1_attrition_overall[,grep("^r2\\d\\d\\d$",names(W1_attrition_overall))]!=0)/5

comment(W1_attrition_overall) <- paste("Dataset created in ",as.Date(Sys.Date()),".\n",sep="")
save(W1_attrition_overall, file = "DB/W1_attrition_overall.Rda")

#Saving public dataset
W1_attrition_overall <- W1_attrition_overall %>% select(id_edital, TT, interviewed, treat_ind,         
                                                        sampling_prob, fieldID, treat_ind,   
                                                        sexor, av_pre, av_prel, formal_pre, formal_pre_prop)

save(W1_attrition_overall, file = paste0(home, "DB/W1_attrition_overall_public.Rda"))


##############Among those who picked up (and were our respondents)
#dependent variable: interviewed, did not get interviewed
#unit of analysis is also individual-lottery

#create individual-lottery format

#Getting those who picked up
pickedup <- results_f2 %>% filter(!is.na(b2))
pickedup_cpfs <- unique(pickedup$cpfstd) 

#Select lotteries inscritos
pu_inscritos <- inscritosf %>% filter(cpfstd %in% pickedup_cpfs)
stopifnot(length(unique(pu_inscritos$cpfstd))==length(pickedup_cpfs)) 
pu_inscritos <- pu_inscritos %>% filter(id_edital %in% c("edital17.2016", "edital20.2016"))

table(pu_inscritos$id_edital) #smell check

#Selecting lotteries winners in the attempts
pu_winners <- sorteadosf %>% filter(cpfstd %in% pickedup_cpfs)
pu_winners <- pu_winners %>% filter(id_edital %in% c("edital17.2016", "edital20.2016"))
table(pu_winners$id_edital) #smell check

#Creating TT labels
pu_winners <- pu_winners %>% mutate(TT1 = case_when(id_edital == "edital17.2016" ~ "T172016", 
                                                    id_edital == "edital20.2016" ~ "T202016")) %>% select(cpfstd, names, TT1, id_edital)

pu_inscritos2 <-  left_join(pu_inscritos, pu_winners, by = c("cpfstd", "id_edital"))
nrow(pu_inscritos2) == nrow(pu_inscritos) 

pu_inscritos2 <- pu_inscritos2 %>% mutate(TT = case_when(id_edital =="edital17.2016" & is.na(TT1) ~ "C172016", 
                                                         id_edital =="edital20.2016" & is.na(TT1) ~ "C202016", 
                                                         id_edital =="edital17.2016" & !is.na(TT1) ~ "T172016", 
                                                         id_edital =="edital20.2016" & !is.na(TT1) ~ "T202016"))
table(pu_inscritos2$TT)

#Stacking for individual-lottery format
data_ind_lottery_pickedup <- pu_inscritos2
stopifnot(length(unique(data_ind_lottery_pickedup$cpfstd))==length(pickedup_cpfs)) 

#Adding Rais data
load(paste0(home, "DB/all_wide_rais_october.Rda"))

#Joining
W1_attrition <- data_ind_lottery_pickedup %>% left_join(all_wide_rais_all_editais, by = "cpfstd")

stopifnot(nrow(W1_attrition)==nrow(data_ind_lottery_pickedup))

#operationalize dependent variable (interviewed == 1, refused == 0)

interviewed <- results_f2 %>% mutate(answered = ifelse(g1 == "Sim", 1, 0)) %>% filter(answered == 1)

#There are repeated interviews
tmp <- answered %>% filter(duplicated(cpfstd))
#Excluding repeated from survey
answered <- answered %>% filter(!cpfstd %in% tmp$cpfstd)

cpfs_answered <- unique(answered$cpfstd)

refused <-   results_f2 %>% filter(b2 == 3 | c1 %in% c(2, 3))
cpfs_refused <- unique(refused$cpfstd)

#Creating indicator

W1_attrition <- W1_attrition %>% mutate(interviewed = ifelse(cpfstd %in% cpfs_answered, 1,
                                                       ifelse(cpfstd  %in% cpfs_refused, 0, NA)))

#Smell checks
table(W1_attrition$interviewed)
table(is.na(W1_attrition$interviewed)) #NAs are scheduled

#Remove NAs (scheduled)
W1_attrition <- W1_attrition %>% filter(!is.na(interviewed))

#Treatment indicator
W1_attrition <- W1_attrition %>% mutate(treat_ind = ifelse(TT %in% c("T172016", "T202016"), 1, 0))

#Smell check
table(W1_attrition$edital17.2016, W1_attrition$edital20.2016)

#Survey weights
W1_attrition <- W1_attrition %>% left_join(survey_weights, by = "cpfstd")

#Getting sex indicator
tmp1 <- get_gender(W1_attrition$names.clean, state = "RJ")
tmp2 <- tibble(tmp1)
W1_attrition <- bind_cols(W1_attrition, tmp2)

#Creating id
W1_attrition$fieldID   #redacted #create fake ids

#Making data anonymous

W1_attrition <- W1_attrition %>% select(-cpfstd, -names.x, -cpf.x, -ids, -cpf2.x, 
                                                        -unusable, -cpf3, -ind_notstd, -first_digit_valid,         
                                                        -second_digit_valid, -TT1, -cpf.y, 
                                                        -final_invalid, -rep, -names.y,
                                                        -names.clean, -sum_participation, 
                                                        -sum_participation_geral, -sum_participation_territorial, 
                                                        -sum_wins, -sum_wins_geral, -sum_wins_territorial, -names.clean.17, -names.clean.20, -cpf2.y)


W1_attrition <- W1_attrition %>% mutate(
  r2010 = ifelse(is.na(`2010`), 0, `2010`),
  r2011 = ifelse(is.na(`2010`), 0, `2011`),
  r2012 = ifelse(is.na(`2010`), 0, `2012`),
  r2013 = ifelse(is.na(`2013`), 0, `2013`),
  r2014 = ifelse(is.na(`2014`), 0, `2014`),
  sexor = dplyr::recode(tmp1, `Male` = 1, `Female` = 0))

# Create summary RAIS variables pre-treatment (only years up to 2010 are included)
W1_attrition$av_pre <- rowSums(W1_attrition[,grep("^r2\\d\\d\\d$",names(W1_attrition))], na.rm=T)/5
W1_attrition$av_prel  <- log(1+W1_attrition$av_pre)
W1_attrition$formal_pre <- rowSums(W1_attrition[,grep("^r2\\d\\d\\d$",names(W1_attrition))]!=0)
W1_attrition$formal_pre_prop <- rowSums(W1_attrition[,grep("^r2\\d\\d\\d$",names(W1_attrition))]!=0)/5

comment(W1_attrition) <- paste("Dataset created in ",as.Date(Sys.Date()),".\n",sep="")
save(W1_attrition, file = paste0(home, "DB/W1_attrition.Rda"))

#Saving public dataset
W1_attrition <- W1_attrition %>% select(id_edital, TT, interviewed, treat_ind,       
                                                        sampling_prob, fieldID, treat_ind,   
                                                        sexor, av_pre, av_prel, formal_pre, formal_pre_prop)

save(W1_attrition, file = paste0(home, "DB/W1_attrition_public.Rda"))

